package is2.lemmatizer;


import is2.data.Cluster;
import is2.data.F2SF;
import is2.data.FV;
import is2.data.Instances;
import is2.data.InstancesTagger;
import is2.data.Long2Int;
import is2.data.ParametersFloat;
import is2.data.PipeGen;
import is2.data.SentenceData09;
import is2.io.CONLLReader09;
import is2.io.CONLLWriter09;
import is2.tools.IPipe;
import is2.tools.Tool;
import is2.tools.Train;
import is2.util.DB;
import is2.util.OptionsSuper;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;



/**
 * Lemmatizer that maps word forms to lemmas by (a) looking the lower-cased form up in
 * the lexicon held by the pipe ({@code pipe.opse}) and, on a miss, (b) scoring every
 * known edit operation ({@code pipe.types}) with a linear model and applying the best
 * one to the form via {@link StringEdit#change}.
 *
 * <p>When {@code doUppercase} is set, an additional binary decision (stored under the
 * class indices {@code types.length+1} and {@code types.length+2}) determines whether
 * the first character of the lemma is upper-cased.
 *
 * <p>NOTE(review): the feature buffer {@link #vs} is a shared instance field that is
 * handed to {@code pipe.addCoreFeatures} for every token, so one instance should
 * presumably not be used from several threads at once - confirm against callers.
 */
public class Lemmatizer implements Tool, Train {

	public Pipe pipe;
	public ParametersFloat params;
	private Long2Int li;

	/** Whether the casing of the lemma's first character is predicted by the model. */
	private boolean doUppercase=false;

	/** Scratch buffer for the feature codes of the current token (passed to addCoreFeatures). */
	private long[] vs= new long[40];



	/**
	 * Creates a lemmatizer from the model stored in modelFileName.
	 *
	 * <p>Failures are reported on stderr only (the IOException is caught here and
	 * {@link #readModel} prints its own exceptions), so the instance may be left
	 * without a loaded model.
	 *
	 * @param modelFileName the path and file name to a lemmatizer model
	 */
	public Lemmatizer(String modelFileName)  {

		// tell the lemmatizer the location of the model
		try {
			Options options = new Options(new String[] {"-model", modelFileName});
			li = new Long2Int(options.hsize);

			// initialize the lemmatizer
			readModel(options);

		} catch (IOException e) {
			e.printStackTrace();
		}
	}


	/**
	 * Creates an empty lemmatizer; pipe, parameters and hash function have to be set
	 * up by training or by {@link #readModel}.
	 *
	 * @param doUppercase whether the casing of the first lemma character is modelled
	 */
	public Lemmatizer(boolean doUppercase) {this.doUppercase=doUppercase; }


	/**
	 * Command line entry point: depending on the options trains a model and writes it,
	 * applies a model to the test file and/or evaluates the output against the gold file.
	 *
	 * @param args command line arguments, parsed by {@link Options}
	 */
	public static void main (String[] args) throws FileNotFoundException, Exception
	{

		Options options = new Options(args);
		Lemmatizer lemmatizer = new Lemmatizer(options.upper);

		long start = System.currentTimeMillis();

		if (options.train) {

			lemmatizer.li = new Long2Int(options.hsize);
			lemmatizer.pipe =  new Pipe (options,lemmatizer.li);

			InstancesTagger is = lemmatizer.pipe.createInstances(options.trainfile);

			DB.println("Features: " + lemmatizer.pipe.mf.size()+" Operations "+lemmatizer.pipe.mf.getFeatureCounter().get(Pipe.OPERATION));   

			ParametersFloat params = new ParametersFloat(lemmatizer.li.size());

			lemmatizer.train(options,lemmatizer.pipe,params,is);

			lemmatizer.writeModel(options, lemmatizer.pipe, params);
		}

		if (options.test) {

			lemmatizer.readModel(options);

			lemmatizer.out(options,lemmatizer.pipe, lemmatizer.params);
		}

		System.out.println();

		if (options.eval) {
			System.out.println("\nEVALUATION PERFORMANCE:");
			Evaluator.evaluate(options.goldfile, options.outfile,options.format);
		}
		long end = System.currentTimeMillis();
		// integer division by 100 first: the elapsed time is reported in seconds, truncated to one decimal
		System.out.println("used time "+((float)((end-start)/100)/10));
	}

	/**
	 * Writes the model as a single zip entry named "data" to {@code options.modelName}:
	 * the feature mapping of {@code this.pipe}, the parameters, the data written by
	 * {@code pipe.write} and finally the doUppercase flag.
	 *
	 * <p>Exceptions are printed to stderr and not propagated; the underlying file is
	 * closed in either case.
	 *
	 * @see is2.tools.Train#writeModel(is2.util.OptionsSuper, is2.tools.IPipe, is2.data.ParametersFloat)
	 */
	@Override
	public void writeModel(OptionsSuper options, IPipe pipe,
			ParametersFloat params) {

		// outermost stream that still has to be closed if something goes wrong
		java.io.Closeable open = null;
		try {
			// store the model
			FileOutputStream fos = new FileOutputStream(options.modelName);
			open = fos;
			ZipOutputStream zos = new ZipOutputStream(new BufferedOutputStream(fos));
			zos.putNextEntry(new ZipEntry("data")); 
			DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(zos));
			open = dos;

			this.pipe.mf.writeData(dos);

			dos.flush();
			params.write(dos);

			pipe.write(dos);

			dos.writeBoolean(this.doUppercase);

			dos.flush();
			// regular close: a failure here must be reported, so it is not done quietly
			dos.close(); 
			open = null;
		} catch(Exception e) {
			e.printStackTrace();
		} finally {
			closeQuietly(open);
		}
	}


	/**
	 * Loads the model from {@code options.modelName} and (re)initializes
	 * {@link #params}, {@link #li} and {@link #pipe} from it. The doUppercase flag is
	 * only read when the stream still reports available data after the cluster data.
	 *
	 * <p>Exceptions are printed to stderr and not propagated; the underlying file is
	 * closed in either case.
	 *
	 * @param options options providing the model file name; also passed to the new Pipe
	 */
	public void readModel(OptionsSuper options) {

		// outermost stream that still has to be closed if something goes wrong
		java.io.Closeable open = null;
		try {

			// load the model
			FileInputStream fis = new FileInputStream(options.modelName);
			open = fis;
			ZipInputStream zis = new ZipInputStream(new BufferedInputStream(fis));
			zis.getNextEntry();
			DataInputStream dis = new DataInputStream(new BufferedInputStream(zis));
			open = dis;

			MFO mf = new MFO();
			mf.read(dis);
			params = new ParametersFloat(0);
			params.read(dis);
			li =new Long2Int(params.size());
			pipe = new Pipe(options, li);
			pipe.mf =mf;

			pipe.initFeatures();
			pipe.initValues();

			pipe.readMap(dis);

			// index -> operation string table used when applying the model
			for(Entry<String,Integer> e : mf.getFeatureSet().get(Pipe.OPERATION).entrySet()) {
				this.pipe.types[e.getValue()] = e.getKey();
			}

			pipe.cl = new Cluster(dis);

			if (dis.available()>0) this.doUppercase = dis.readBoolean();

			dis.close();
			open = null;
			DB.println("Loading data finished. ");

			DB.println("number of params  "+params.parameters.length);
			DB.println("number of classes "+pipe.types.length);

		} catch (Exception e ) {
			e.printStackTrace();
		} finally {
			closeQuietly(open);
		}

	}


	/**
	 * Closes the given resource if it is non-null, ignoring an IOException raised by close.
	 * Only used on paths where an earlier error has already been (or will be) reported.
	 */
	private static void closeQuietly(java.io.Closeable c) {
		if (c == null) return;
		try {
			c.close();
		} catch (IOException ignored) {
			// best effort: nothing sensible can be done about a failing close here
		}
	}


	/**
	 * Adds to target the hashed index of every positive feature code in vs, combined
	 * with the given class label. The buffer is traversed from the last to the first slot.
	 */
	private static void addFeatures(F2SF target, long[] vs, Long2Int li, int label) {
		for(int l=vs.length-1;l>=0;l--) {
			if (vs[l]>0) target.add(li.l2i(vs[l]+(label*Pipe.s_type)));
		}
	}


	/**
	 * Same as {@link #addFeatures(F2SF, long[], Long2Int, int)} but collects the
	 * indices in a feature vector.
	 */
	private static void addFeatures(FV target, long[] vs, Long2Int li, int label) {
		for(int l=vs.length-1;l>=0;l--) {
			if (vs[l]>0) target.add(li.l2i(vs[l]+(label*Pipe.s_type)));
		}
	}


	/**
	 * Performs one update of the parameters towards goldLabel and away from
	 * predictedLabel for the token whose feature codes are currently stored in
	 * {@link #vs}. The vectors f (predicted) and g (gold) are cleared and refilled.
	 */
	private void updateWeights(ParametersFloat params, FV g, FV f, int goldLabel, int predictedLabel, double upd) {

		f.clear();
		addFeatures(f, vs, li, predictedLabel);

		g.clear();
		addFeatures(g, vs, li, goldLabel);

		double lam_dist = params.getScore(g) - params.getScore(f);
		double loss = 1 - lam_dist;

		FV dist = g.getDistVector(f);
		dist.update(params.parameters, params.total, params.update(dist,loss), upd,false); 
	}


	/**
	 * Do the training: runs {@code options.numIters} passes over the instances. For
	 * every token that is not covered by the lexicon ({@code pipe.opse}) the best
	 * scoring operation is compared with the gold operation and the parameters are
	 * updated on a mismatch; with doUppercase set, the casing decision is trained in
	 * the same pass. Finally the parameters are averaged.
	 *
	 * <p>Progress and per-iteration statistics are printed to stdout.
	 *
	 * @param options provides the number of iterations
	 * @param p not used; the features are taken from {@code this.pipe}
	 * @param params the parameters to train (updated in place)
	 * @param ist the training instances; must be an {@link InstancesTagger}
	 */
	public void train(OptionsSuper options, IPipe p, ParametersFloat params, Instances ist) {

		InstancesTagger is = (InstancesTagger)ist;

		int i = 0,del=0; 
		FV g = new FV(), f = new FV();

		// class indices of the two casing decisions, placed behind the operation classes
		int LC = this.pipe.types.length+1, UC = LC+1;

		String wds[] = MFO.reverse(pipe.mf.getFeatureSet().get(Pipe.WORD));

		F2SF fs = params.getFV();
		double upd=0;

		for(i = 0; i < options.numIters; i++) {

			System.out.print("Iteration "+i+": ");

			long start = System.currentTimeMillis();
			int numInstances = is.size();
			int correct =0,count=0;

			long last= System.currentTimeMillis();
			int wrongOp=0,correctOp=0, correctUC=0, wrongUC=0;

			for(int n = 0; n < numInstances; n++) {

				if((n+1) % 500 == 0) del= Pipe.outValueErr(n+1, (float)(count-correct),(float)correct/(float)count,del,last,upd);

				// weight of the update for the averaging: number of remaining instance visits + 1
				upd = (double)(options.numIters*numInstances - (numInstances*i+(n+1))+ 1);

				for(int k = 0; k < is.length(n); k++) {

					double best = -1000;
					String bestOp="";

					count++;
					String form = wds[is.forms[n][k]];
					pipe.addCoreFeatures(is, n, k, 0,form, vs);

					String lemma = pipe.opse.get(form.toLowerCase());

					// predict the operation only if the lexicon does not know the form
					if (lemma==null) {
						for(int t = 0; t < pipe.types.length; t++) {

							fs.clear();
							addFeatures(fs, vs, li, t);

							float score = (float) fs.getScore();
							if (score >best) {
								bestOp = pipe.types[t];
								best =score;
							}
						}
					}

					if (doUppercase) {
						fs.clear();
						addFeatures(fs, vs, li, LC);

						String gold = wds[is.glemmas[n][k]];

						// a score > 0 means "lower case", a score <= 0 means "upper case"
						int correctOP =-1, selectedOP =-1;	
						if (gold.length()>0 && Character.isUpperCase(gold.charAt(0)) && fs.score > 0) {
							correctOP = UC;
							selectedOP =LC;
						}  else if (gold.length()>0 && Character.isLowerCase(gold.charAt(0)) && fs.score <= 0) {
							correctOP = LC;
							selectedOP =UC;
						}

						if (correctOP!=-1) {
							wrongUC++;
							updateWeights(params, g, f, correctOP, selectedOP, upd);
						} else {
							correctUC++;
						}
					}

					// lexicon hits count as correct and need no operation update
					if (lemma!=null) {
						correct++;
						correctOp++;
						continue;
					}

					String op = Pipe.getOperation(is,n, k,wds);
					if (op.equals(bestOp) ) {
						correct++;
						correctOp++;
						continue;
					}
					wrongOp++;

					int bop =pipe.mf.getValue(Pipe.OPERATION, bestOp);
					int gop =pipe.mf.getValue(Pipe.OPERATION, op);
					updateWeights(params, g, f, gop, bop, upd);
				}

			}

			Pipe.outValueErr(numInstances, (float)(count-correct), (float)correct/(float)count,del,last,upd, 
					"time "+(System.currentTimeMillis()-start)+
					" corr/wrong "+correctOp+" "+wrongOp+" uppercase corr/wrong  "+correctUC+" "+wrongUC);
			del=0;
			System.out.println();			
		}

		params.average(i*is.size());

	}


	/**
	 * Do the work: reads the sentences of {@code options.testfile}, lemmatizes them
	 * with the model of this instance and writes them to {@code options.outfile}.
	 *
	 * <p>Exceptions are printed to stderr and not propagated; the writer is finished
	 * in either case so that already processed sentences are not lost.
	 *
	 * @param options input/output files, format and the normalize/overwritegold switches
	 * @param pipe not used; the pipe of this instance is applied
	 * @param params not used; the parameters of this instance are applied
	 */
	public void out (OptionsSuper options, IPipe pipe, ParametersFloat params)  {

		long start = System.currentTimeMillis();

		CONLLReader09 depReader = new CONLLReader09(options.testfile, CONLLReader09.NO_NORMALIZE);
		depReader.setInputFormat(options.formatTask);
		CONLLWriter09 depWriter = new CONLLWriter09(options.outfile);
		depWriter.setOutputFormat(options.formatTask);

		System.out.print("Processing Sentence: ");

		int cnt = 0;
		int del=0;

		// guards against calling finishWriting a second time from the finally block
		boolean finishAttempted = false;

		try {

			while(true) {

				InstancesTagger is = new InstancesTagger();

				is.init(1, new MFO());
				SentenceData09 instance = depReader.getNext(is);

				if (instance==null) break;
				is.fillChars(instance, 0, Pipe._CEND);
				cnt++;
				SentenceData09 i09 =lemmatize(is, instance, this.li);

				if(options.normalize) {
					// the reader was created with NO_NORMALIZE; switch normalization on for the lemmas only
					boolean save = depReader.normalizeOn;
					depReader.normalizeOn =true;
					for(int k=0;k<i09.length();k++) {
						i09.plemmas[k] = depReader.normalize(i09.plemmas[k]);
					}
					depReader.normalizeOn = save;
				}

				if (options.overwritegold)  i09.lemmas = i09.plemmas;

				depWriter.write(i09);

				if (cnt%100 ==0) del=Pipe.outValue(cnt, del);

			}
			finishAttempted = true;
			depWriter.finishWriting();
			del=Pipe.outValue(cnt, del);
			long end = System.currentTimeMillis();

			System.out.println(PipeGen.getSecondsPerInstnace(cnt,(end-start)));
			System.out.println(PipeGen.getUsedTime(end-start));
		} catch(Exception e) {
			e.printStackTrace();
		} finally {
			if (!finishAttempted) {
				// an error interrupted the loop: still finish the output written so far
				try {
					depWriter.finishWriting();
				} catch(Exception e) {
					e.printStackTrace();
				}
			}
		}
	}


	/**
	 * Predicts the lemmas of one sentence and stores them in {@code instance.plemmas}.
	 * A lemma found in the lexicon for the lower-cased form wins; otherwise the best
	 * scoring operation is applied to the form. Empty results are replaced by "_".
	 *
	 * @param is the instance container holding the sentence at index 0
	 * @param instance the sentence; its plemmas are overwritten
	 * @param li the hash function mapping feature codes to parameter indices
	 * @return a new sentence created from the updated instance
	 */
	private SentenceData09 lemmatize(InstancesTagger is, SentenceData09 instance, Long2Int li) {

		// class index of the casing decision, see train
		int LC = pipe.types.length+1;

		is.feats[0] = new short[instance.length()][11];

		is.fillChars(instance, 0, Pipe._CEND);

		int length = instance.length();

		F2SF fs = new F2SF(params.parameters);						

		for(int w1 = 0; w1 < length; w1++) {
			instance.plemmas[w1]="_";
			pipe.addCoreFeatures(is, 0, w1, 0,instance.forms[w1], vs);

			String f =null;
			if (is.forms[0][w1]!=-1) {
				f = pipe.opse.get(instance.forms[w1].toLowerCase());
				if (f!=null) {
					instance.plemmas[w1]=f;
				}
			} 

			// score the operations only when the lexicon has no entry (as in train)
			if (f==null) {
				double best = -1000.0;
				int bestOp=0;

				for(int t = 0; t < pipe.types.length; t++) {

					fs.clear();
					addFeatures(fs, vs, li, t);

					if (fs.score >=best) {
						best =fs.score;
						bestOp=t;
					}		
				}
				instance.plemmas[w1] = StringEdit.change((doUppercase?instance.forms[w1]:instance.forms[w1].toLowerCase()),pipe.types[bestOp]);
			}

			// check for empty string
			if(instance.plemmas[w1].length()==0) instance.plemmas[w1] = "_";

			if(doUppercase){
				fs.clear();
				addFeatures(fs, vs, li, LC);

				String lemma = instance.plemmas[w1];

				// a score <= 0 selects an upper-case first character, a score > 0 lower case
				if (fs.score<=0) {
					if (lemma.length()>0) {
						instance.plemmas[w1] = Character.toUpperCase(lemma.charAt(0))+lemma.substring(1);
					}
				} else if (fs.score>0) {
					instance.plemmas[w1] = lemma.toLowerCase();
				}
			}
		}


		SentenceData09 i09 = new SentenceData09(instance);
		i09.createSemantic(instance);
		return i09;
	}


	/**
	 * Lemmatizes a single sentence. An empty sentence is returned unchanged; otherwise
	 * a copy with root is created from snt and the lemmatized sentence derived from
	 * that copy is returned.
	 *
	 * @see is2.tools.Tool#apply(is2.data.SentenceData09)
	 */
	@Override
	public SentenceData09 apply(SentenceData09 snt) {

		// be robust
		if (snt.length()== 0) return snt; 

		InstancesTagger is = new InstancesTagger();

		SentenceData09 it = new SentenceData09();
		it.createWithRoot(snt);

		is.init(1, new MFO());
		is.createInstance09(it.length());
		is.fillChars(it, 0, Pipe._CEND);

		for(int j = 0; j < it.length(); j++) is.setForm(0, j, it.forms[j]);

		return lemmatize(is, it,li);
	}

}
